library(data.table)
library(dplyr)
library(zoo)
library(lubridate)
library(ggplot2)
library(readr)
library(plotly)
library(RColorBrewer)
library(grDevices)
library(knitr)

####READING DATA
#Data files used were made by using Python code (file Project_notebook.ipynb) and/or was sent by the client

#Reservations: 2 datafiles
res_view_old <- fread("Reservations_view.csv")[,c(2,3,4,6,7,8,9,14,16,28,31,34,35,36,41)]
res_view <- fread("Res_new.csv")
res_view <- res_view[,c(2,3,4,6,7,8,9,18,20,32,35,38,39,40,45)]

reservations <- merge(res_view_old,res_view,all=TRUE)
reservations <- reservations[!duplicated(reservations$rate_quote_uuid),]
write.csv(reservations,"reservations.csv")


#Rate quotes filtered & dup
#dup - no duplicates by request uuid
#filtered - no duplicates and removed very similar requests (gone through phase 2)

rateq1_filtered <- fread("filtered_rate_quote_1.csv")
rateq2_filtered <- fread("filtered_rate_quote_2.csv")
rateq3_filtered <- fread("filtered_rate_quote_3.csv")
rateq4_filtered <- fread("filtered_rate_quote_4.csv")
rateq_filtered <- merge(merge(merge(rateq1_filtered,rateq2_filtered,all=TRUE),rateq3_filtered,all=TRUE),rateq4_filtered,all=TRUE)
remove(rateq1_filtered,rateq2_filtered,rateq3_filtered,rateq4_filtered)

rateq1_dup <- fread("rate_quote_1_dup1.csv")
rateq2_dup <- fread("rate_quote_2_dup1.csv")
rateq3_dup <- fread("rate_quote_3_dup1.csv")
rateq4_dup <- fread("rate_quote_4_dup1.csv")
rateq_dup <- merge(merge(merge(rateq1_dup,rateq2_dup,all=TRUE),rateq3_dup,all=TRUE),rateq4_dup,all=TRUE)
remove(rateq1_dup,rateq2_dup,rateq3_dup,rateq4_dup)

write.csv(rateq_dup,"rateq_dup.csv")
write.csv(rateq_filtered,"rateq_filtered.csv")
hotdays_rq_nodupl_brokers_byday <- fread("hotdays_rq_nodupl.csv")
hotdays_rq_nodupl_brokers_byweek <- fread("hotdays_rq_nodupl_week.csv")
hotdays_reserv_brokers_byday <- fread("hotdays_reserv_broker.csv")
hotdays_reserv_statustype_byday <- fread("hotdays_reserv_statustype.csv")
hotdays_reserv_acriss_byday <-fread("hotdays_reserv_acriss.csv")

Overview of how many days we have in the rate quote (duplicated) data.

tabel_days <- rateq_dup %>% group_by(as.Date(timestamp)) %>% summarise(min=min(timestamp),max=max(timestamp))
kable(tabel_days,caption = "Rate quotes")
Rate quotes
as.Date(timestamp) min max
2017-12-01 2017-12-01 00:00:39.911 2017-12-01 23:59:42.076
2017-12-02 2017-12-02 00:00:21.101 2017-12-02 23:59:53.040
2017-12-03 2017-12-03 00:00:04.960 2017-12-03 20:57:14.210
2017-12-05 2017-12-05 00:00:00.060 2017-12-05 23:59:48.232
2017-12-06 2017-12-06 00:00:01.880 2017-12-06 23:59:36.254
2017-12-07 2017-12-07 00:00:06.907 2017-12-07 09:24:34.903
2017-12-11 2017-12-11 04:00:28.030 2017-12-11 21:27:35.617
2017-12-12 2017-12-12 03:36:12.871 2017-12-12 20:19:27.911
2017-12-13 2017-12-13 00:00:00.039 2017-12-13 23:59:53.457
2017-12-14 2017-12-14 00:00:08.780 2017-12-14 23:59:56.727
2017-12-15 2017-12-15 00:00:01.885 2017-12-15 21:19:09.761

Removing date 2017-12-07, due to the fact that it contains only data with the time period of 9 hours, which is clearly a lot different from other days.

Overview of the reservations data.

tabel_days2 <- reservations %>% group_by(date=as.Date(rate_request_timestamp)) %>% summarise(count=n())
p_tabel_days2 <- tabel_days2 %>% plot_ly(x=~date,y=~count) %>% add_lines() %>% layout(title="Reservations made")
p_tabel_days2

We can see that there are not many reservations done before 27th of February. For further illustrations, let’s cut this part of the data out.

Filtration

#Removing date 2017-12-07
rateq_dup <- rateq_dup[!(as.Date(timestamp)=="2017-12-07"),]
rateq_filtered <- rateq_filtered[!(as.Date(timestamp)=="2017-12-07"),]
reservations <- reservations[as.Date(reservations$rate_request_timestamp) >="2017-02-27",]

one <- rateq_dup[,2]
one$datatype <- "Duplicated by uuid"
two <- rateq_filtered[,3]
two$datatype <- "Filtered"
together<- merge(one,two,all=TRUE)


#REQUESTS-----------------

#in case a log scale is needed
#together <-together %>% group_by(datatype,as.factor(as.Date(timestamp))) %>% summarise(count=n())
#ggplot(together, aes(x=`as.factor(as.Date(timestamp))`,y=count,group=datatype,colour=datatype))+geom_line()

p_distr_notchanging_plot <-ggplot(together,aes(x=as.factor(as.Date(timestamp)),group=datatype,colour=datatype))+labs(xlab="",title="Requests per day")+theme_bw()+geom_line(size=1.25,stat="count")+theme(legend.text = element_text(size=12),legend.title = element_blank(),axis.text.x = element_text(angle = 90, hjust = 1),legend.position = "bottom",plot.title=element_text(hjust=0.5),axis.title.x=element_blank(),panel.grid.minor = element_blank())
p_distr_notchanging_plot

One can see that the distribution is pretty similar between phase 1 and phase 2 filtration. Phase 1 - removing duplicated rows by uuid (red line) and Phase 2 - similar requests removed after removing duplicated rows by uuid.

Days in advance

modif_rate <- rateq_dup
modif_rate$timestamp <- as.Date(modif_rate$timestamp)
modif_rate$pickup_timestamp <- as.Date(modif_rate$pickup_timestamp)
modif_rate$return_timestamp <- as.Date(modif_rate$return_timestamp)
modif_rate$days_in_advance <- modif_rate$pickup_timestamp - modif_rate$timestamp

p_days_in_advance_dist <-ggplot(modif_rate,aes(x=days_in_advance))+geom_histogram(binwidth = 10,colour="black",fill=rgb(0.1,0.67,0.674))+theme_bw()+
  labs(x="Days in advance",title="Price checking")+scale_x_continuous(breaks=c(seq(0,1000,50)))+
  theme(panel.grid.minor=element_blank(),legend.position = "none",plot.title=element_text(hjust=0.5))
p_days_in_advance_dist

This plot shows how many days in advance customers are checking the prices. We can see that the most popular is first two weeks or three weeks. We can also see a slight bump near 200 days, which is logical because the rate quotes here have been made in december, so ~200 days ahead means checking prices for the summer that is the most popular vacation season. However since we are dealing with data from Iceland, one can see that winter times are very popular as well!

#In case there is more data and one would like to see the same plot within seasons
yq <- as.yearqtr(as.yearmon(modif_rate$timestamp, "%m/%d/%Y") + 1/12)
modif_rate$Season <- factor(format(yq, "%q"), levels = 1:4, labels = c("winter", "spring", "summer", "fall"))

p_by_season_facets <-ggplot(modif_rate,aes(x=days_in_advance))+geom_histogram(binwidth = 10,colour="black",fill=rgb(0.1,0.67,0.674))+facet_wrap(~Season,nrow=2)+theme_bw()+labs(x="Days in advance",title="Price checking by season")+scale_x_continuous(breaks=c(seq(0,1000,50)))+theme(panel.grid.minor=element_blank(),legend.position = "none",plot.title=element_text(hjust=0.5))
p_by_season_facets
modif_res <- reservations
modif_res$rate_request_timestamp <- as.Date(modif_res$rate_request_timestamp)
modif_res$pickup_timestamp <- as.Date(modif_res$pickup_timestamp)
modif_res$return_timestamp <- as.Date(modif_res$return_timestamp)
modif_res$days_in_advance <- modif_res$pickup_timestamp - modif_res$rate_request_timestamp

p_days_in_advance_dist_reserv <-ggplot(modif_res,aes(x=days_in_advance))+geom_histogram(binwidth = 10,colour="black",fill=rgb(0.1,0.67,0.674))+theme_bw()+
  labs(x="Days in advance",title="Reservations made")+scale_x_continuous(breaks=c(seq(0,1000,50)))+
  theme(panel.grid.minor=element_blank(),legend.position = "none",plot.title=element_text(hjust=0.5))
p_days_in_advance_dist_reserv

The plot for reservations made however is slightly different than the price checking plot. Let’s look at the days in advance distribution by seasons when the reservation requests were made.

#getting season in case needed
yq <- as.yearqtr(as.yearmon(modif_res$rate_request_timestamp, "%m/%d/%Y") + 1/12)
modif_res$Season <- factor(format(yq, "%q"), levels = 1:4, labels = c("WINTER", "SPRING", "SUMMER", "FALL"))

p_by_season_facets_reserv <-ggplot(modif_res,aes(x=days_in_advance))+geom_histogram(binwidth = 10,colour="black",fill=rgb(0.1,0.67,0.674))+facet_wrap(~Season,nrow=2)+theme_bw()+labs(x="Days in advance",title="Reservations requests made by season")+scale_x_continuous(breaks=c(seq(0,1000,50)))+theme(panel.grid.minor=element_blank(),legend.position = "none",plot.title=element_text(hjust=0.5),panel.grid.major.x=element_blank())
p_by_season_facets_reserv

One can notice that Winter and Fall are distributed very similarly. One can notice a bump in spring for 100-150 days advance, which means that these reservations are made for the summer season.

bybroker_data_req <-rateq_filtered %>% filter(Broker_name != "RateChain test")%>%group_by(Broker_name,as.factor(as.Date(timestamp))) %>% summarise(count=n())
bybroker_data_req$`as.factor(as.Date(timestamp))` <- format(as.Date(bybroker_data_req$`as.factor(as.Date(timestamp))`,"%Y-%m-%d"),format="%d. %b")
picked_colours <- brewer.pal(length(unique(bybroker_data_req$Broker_name)),"Spectral")
p_bybroker <- bybroker_data_req %>% plot_ly(x=~`as.factor(as.Date(timestamp))`,y=~count,type="bar",color=~Broker_name,colors=picked_colours) %>%
  layout(barmode="stack",title="Rate quotes by brokers",xaxis=list(title="",tickangle=0),legend=list(y=0.5))
p_bybroker
#log_scale; but hover shows count not log10(count)
p_bybroker_logscale <- bybroker_data_req %>% plot_ly(x=~`as.factor(as.Date(timestamp))`,y=~log10(count),type="bar",color=~Broker_name,colors=picked_colours,hoverinfo="text",text=~paste0(Broker_name,":\t",count)) %>%
  layout(barmode="stack",title="Requests by brokers",xaxis=list(title="",tickangle=0),legend=list(y=0.5))
#p_bybroker_logscale

TravelJigsaw has the most rate quotes, then BSP Auto and after that SupplierWebsite.

bybroker_data_res <- reservations %>% group_by(date=as.Date(rate_request_timestamp),`Broker name`) %>% summarise(count=n())
picked_colours <- brewer.pal(length(unique(bybroker_data_res$`Broker name`)),"Spectral")
p_bybroker_res <- bybroker_data_res %>% plot_ly(x=~date,y=~count,type="bar",color=~`Broker name`,colors=picked_colours) %>%
  layout(barmode="stack",title="Reservations by brokers",xaxis=list(title="Reservation request date",tickangle=0,type="date"),legend=list(y=0.5))
p_bybroker_res

One can notice that TravelJigsaw still has the most reservations, but the second place goes to SupplierWebsite, not BSP Auto as it was in rate quote data. Seems like SupplierWebsite first appeared in July 2017 and covers a significant percentage of reservations.

Hot days

#HOTDAYS-----------------
#Rate quotes
#by day

hotdays_rq_nodupl_brokers_byday <- hotdays_rq_nodupl_brokers_byday[,-6]#removing ratechain test
hotdays_rq_nodupl_brokers_byday$TravelJigsaw <- as.numeric(hotdays_rq_nodupl_brokers_byday$TravelJigsaw)
hotdays_rq_nodupl_brokers_byday <- melt(hotdays_rq_nodupl_brokers_byday)
hotdays_rq_nodupl_brokers_byday[is.na(hotdays_rq_nodupl_brokers_byday)] <- 0

picked_colours <- brewer.pal(length(unique(hotdays_rq_nodupl_brokers_byday$variable)),"Spectral")
p_hotdays_rq_brokers_byday <- hotdays_rq_nodupl_brokers_byday %>% plot_ly(x=~timestamp,y=~value,color=~variable,colors=picked_colours) %>% 
  add_lines() %>% layout(title="Hot days (rate quotes) by brokers",xaxis=list(title="",type="date"),legend=list(y=0.5),yaxis=list(title="count of requests"))
p_hotdays_rq_brokers_byday
hotdays_rq_nodupl_brokers_byweek <- hotdays_rq_nodupl_brokers_byweek[,-6]#removing ratechain test
hotdays_rq_nodupl_brokers_byweek$TravelJigsaw <- as.numeric(hotdays_rq_nodupl_brokers_byweek$TravelJigsaw)
hotdays_rq_nodupl_brokers_byweek <- melt(hotdays_rq_nodupl_brokers_byweek)
hotdays_rq_nodupl_brokers_byweek[is.na(hotdays_rq_nodupl_brokers_byweek)] <- 0

picked_colours <- brewer.pal(length(unique(hotdays_rq_nodupl_brokers_byweek$variable)),"Spectral")
p_hotdays_rq_brokers_byweek <- hotdays_rq_nodupl_brokers_byweek %>% plot_ly(x=~timestamp,y=~value,color=~variable,colors=picked_colours) %>% 
  add_lines() %>% layout(title="Hot weeks (rate quotes) by brokers",xaxis=list(title="",type="date"),legend=list(y=0.5),yaxis=list(title="count of requests"))
p_hotdays_rq_brokers_byweek
hotdays_reserv_brokers_byday <- hotdays_reserv_brokers_byday[,-6]#removing ratechain test
hotdays_reserv_brokers_byday <- melt(hotdays_reserv_brokers_byday)
hotdays_reserv_brokers_byday[is.na(hotdays_reserv_brokers_byday)] <- 0
hotdays_reserv_brokers_byday <-hotdays_reserv_brokers_byday[as.Date(hotdays_reserv_brokers_byday$timestamp) >="2017-02-27",]

picked_colours <- brewer.pal(length(unique(hotdays_reserv_brokers_byday$variable)),"Spectral")
p_hotdays_reserv_brokers_byday <- hotdays_reserv_brokers_byday %>% plot_ly(x=~timestamp,y=~value,color=~variable,colors=picked_colours) %>% 
  add_lines() %>% layout(title="Hot days (reservations) by brokers",xaxis=list(title="",type="date"),legend=list(y=0.5),yaxis=list(title="count of reservations"))
p_hotdays_reserv_brokers_byday
hotdays_reserv_acriss_byday <- melt(hotdays_reserv_acriss_byday)
hotdays_reserv_acriss_byday[is.na(hotdays_reserv_acriss_byday)] <- 0

picked_colours <- brewer.pal(length(unique(hotdays_reserv_acriss_byday$variable)),"Spectral")
hotdays_reserv_acriss_byday <-hotdays_reserv_acriss_byday[as.Date(hotdays_reserv_acriss_byday$timestamp) >="2017-02-27",]
p_hotdays_reserv_acriss_byday <- hotdays_reserv_acriss_byday %>% plot_ly(x=~timestamp,y=~value,color=~variable,colors=picked_colours,type="bar") %>% 
  layout(barmode="stack",title="Hot days (reservations) by car type",xaxis=list(title="",type="date"),legend=list(y=0.5,x=0.95),yaxis=list(title="count of reservations"))
p_hotdays_reserv_acriss_byday
#In order to save a html widget 
#htmlwidgets::saveWidget(p_hotdays_reserv_acriss_byday,"hotdaysbycartype.html")
hotdays_reserv_statustype_byday <- melt(hotdays_reserv_statustype_byday)
hotdays_reserv_statustype_byday[is.na(hotdays_reserv_statustype_byday)] <- 0
hotdays_reserv_statustype_byday <-hotdays_reserv_statustype_byday[as.Date(hotdays_reserv_statustype_byday$timestamp) >="2017-02-27",]
picked_colours <- brewer.pal(length(unique(hotdays_reserv_statustype_byday$variable)),"Spectral")
p_hotdays_reserv_statustype_byday <- hotdays_reserv_statustype_byday %>% plot_ly(x=~timestamp,y=~value,color=~variable,colors=picked_colours,type="bar") %>% 
  layout(barmode="stack",title="Hot days (reservations) by status type",xaxis=list(title="",type="date"),yaxis=list(title="count of reservations"))
p_hotdays_reserv_statustype_byday
#Season and status type

p_by_season_status_type_facets_reserv <-ggplot(modif_res,aes(x=days_in_advance,group=reservation_status_type,fill=reservation_status_type))+geom_histogram(binwidth = 10,position = "stack",colour="black")+
  facet_wrap(~Season,nrow=2)+scale_fill_manual(values=brewer.pal(5,"Spectral"))+theme_bw()+labs(x="Days in advance",title="Reservations made by season",fill="")+scale_x_continuous(breaks=c(seq(0,1000,50)))+
  theme(panel.grid.minor=element_blank(),legend.position = "bottom",plot.title=element_text(hjust=0.5),panel.grid.major.x=element_blank())
p_by_season_status_type_facets_reserv

Rate quotes to reservations ratio

First let’s look at the plot where the count of reservations and the count of rate quotes for the same day are shown. Note that in the particular dataset there are no days when the count of reservations is 0, however there are a couple of days when the count of rate quotes is smaller (and even 0) than the count of reservations. It means that the rate quotes were made before 1st of December in 2017.

Firstly, let’s look at rate quotes data, that has been gone through phase 1 (duplicates by uuid have been removed).

reservations$rate_request_timestamp <- as.Date(reservations$rate_request_timestamp)
reservations$pickup_timestamp <- as.Date(reservations$pickup_timestamp)
res1 <- reservations %>% select(rate_request_timestamp,pickup_timestamp,rental_days,`Broker name`)


res <- data.frame(rate_request_timestamp=as.Date(character()),
                 rented_day_timestamp=as.Date(character()), 
                 Broker_name=character(), 
                 uid = integer(),
                 stringsAsFactors=FALSE) 

lengtht <- nrow(res1)#approx 8 minutes
for(i in 1:lengtht){
  lengtht2 <- res1$rental_days[i]
  for(j in 1:lengtht2){
    temp <- res1[i,]
    res[nrow(res)+1,] <- c(as.character(as.Date(temp$rate_request_timestamp)),as.character(as.Date(temp$pickup_timestamp) + j - 1),temp$`Broker name`,i)
  }
}

hotdays_rq_nodupl_brokers_byday[is.na(hotdays_rq_nodupl_brokers_byday)] <- 0
rq <- melt(hotdays_rq_nodupl_brokers_byday)
rq$timestamp <- as.Date(rq$timestamp)
colnames(rq)[2] <- "Broker_name"
res <- res %>% filter(as.Date(rate_request_timestamp) >= "2017-12-01") %>%group_by(rented_day_timestamp,Broker_name) %>% summarise(count=n())
colnames(res)[1] <- "timestamp" 
rq <- rq[rq$timestamp %in% res$timestamp,]
all <- merge(rq,res,by=c("timestamp","Broker_name"),all.y = TRUE)
all <- all %>% filter(Broker_name != "RateChain test")
print(paste("The data contains rate quotes and reservations to the following time frame:",min(as.Date(all$timestamp)),"to",max(as.Date(all$timestamp)),"."))
## [1] "The data contains rate quotes and reservations to the following time frame: 2017-12-01 to 2018-11-02 ."
test <- all %>% select(value,count,timestamp) %>% group_by(timestamp) %>%summarise(count=sum(count),value=sum(value))
plot1 <- ggplot(test,aes(x=count,y=value))+geom_point()+labs(x="count of reservations",y="count of rate quotes",title="Rate quotes and reservations relation on phase 1 filtrated data")+theme_bw()+theme(plot.title=element_text(hjust=0.5))
plot1

One can see a relation between the two variables. Now let’s look at the relation when we are using rate quote data that has gone through phase 2 filtration.

rq <- melt(hotdays_rq_filtered)
hotdays_rq_filtered[is.na(hotdays_rq_filtered)] <- 0
rq$timestamp <- as.Date(rq$timestamp)
colnames(rq)[2] <- "Broker_name"
rq <- rq[rq$timestamp %in% res$timestamp,]
all <- merge(rq,res,by=c("timestamp","Broker_name"),all.y = TRUE)
all <- all %>% filter(Broker_name != "RateChain test")
test <- all %>% select(value,count,timestamp) %>% group_by(timestamp) %>%summarise(count=sum(count),value=sum(value))
plot2 <-ggplot(test,aes(x=count,y=value))+geom_point()+labs(x="count of reservations",y="count of rate quotes",title="Rate quotes and reservations relation on phase 2 filtrated data")+theme_bw()+theme(plot.title=element_text(hjust=0.5))
plot2

We can see that the y axis values have decreased a lot, but the overall look of the relation is the same. So let’s continue using the filtered data.

plot3 <-all %>% plot_ly(x=~count,y=~value,color=~Broker_name,type="scatter") %>% layout(title="",xaxis=list(title="count of reservations"),yaxis=list(title="count of rate quotes"))
plot3
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

Now I can see that TravelJigsaw has the most rate quotes. Let’s calculate the rate quotes to reservation ratio.

all$ratio <- all$value/all$count
all_wo_travel_bsp <- all %>% filter(Broker_name != "TravelJigsaw", Broker_name != "BSP Auto")
plot4 <-ggplot(all_wo_travel_bsp,aes(x=Broker_name,y=ratio)) + geom_boxplot()+labs(x="",title="Rate quote to reservations ratio")+ theme_bw() + theme(axis.text.x = element_text(angle = 90),plot.title=element_text(hjust=0.5))
plot4

plot5 <- ggplot(all %>% filter(Broker_name == "TravelJigsaw"),aes(x=Broker_name,y=ratio)) + geom_boxplot()+labs(x="",title="Rate quote to reservations ratio")+ theme_bw()+theme(plot.title=element_text(hjust=0.5))
plot5

plot6 <- ggplot(all %>% filter(Broker_name == "BSP Auto"),aes(x=Broker_name,y=ratio)) + geom_boxplot()+labs(x="",title="Rate quote to reservations ratio")+ theme_bw()+theme(plot.title=element_text(hjust=0.5))
plot6

TravelJigsaw has also the most outliers according to the boxplot.